Load Packages¶

In [ ]:
# Reading in, manipulations
import pandas as pd
import numpy as np
import re

# Plotting
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Evaluating the model
from sklearn.metrics import classification_report, confusion_matrix

# Stop warnings
import warnings
warnings.filterwarnings("ignore")

Set a theme¶

In [ ]:
pio.templates.default = "plotly_white"
pio.renderers.default = "notebook"

Read data and manipulate it¶

In [ ]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

Convert from CamelCase to snake

In [ ]:
def to_snake(camel_case_str):
    return ''.join(['_' + i.lower() if i.isupper() else i for i in camel_case_str]).lstrip('_')
df_train.columns = [to_snake(camel_str) for camel_str in df_train.columns]
df_test.columns = [to_snake(camel_str) for camel_str in df_test.columns]

Add names¶

The data is manually inputted and probably not validated, therefore I'm
applying some changes and adding new variables:

  1. Dates and times
  2. Age in days and age in years
  3. Sex (M, F, unknown)
  4. Reproduction
  5. Simplify color names
  6. Has Name (no name versus named)
  7. Mix (is mix versus not a mix)
  8. Age Category (newborn, baby, animal)
  9. Is aggressive breed

Add dates and time measures

In [ ]:
df_train['date_time'] = pd.to_datetime(df_train['date_time'])
df_train['date'] = df_train['date_time'].dt.date
df_train['hour'] = df_train['date_time'].dt.hour

Add months

In [ ]:
df_train['month'] = df_train['date_time'].dt.month
df_train['year_month'] = df_train['date_time'].dt.strftime("%Y/%m")

Convert the age of pets in days

In [ ]:
def convert_to_days(age_str):
    if pd.isna(age_str):
        return None
    match = re.match(r'(\d+)\s*(year|month|week|day)s?', str(age_str))
    if match:
        value, unit = match.groups()
        value = int(value)
        if unit == 'year':
            return value * 365
        elif unit == 'month':
            return value * 30
        elif unit == 'week':
            return value * 7
        elif unit == 'day':
            return value
    return None 

df_train['age_in_days'] = df_train['ageupon_outcome'].apply(convert_to_days)
df_train['age_in_years'] = df_train['age_in_days'] / 365

df_test['age_in_days'] = df_test['ageupon_outcome'].apply(convert_to_days)
df_test['age_in_years'] = df_test['age_in_days'] / 365

Extract sex

In [ ]:
df_train['sex'] = df_train['sexupon_outcome'].str.split().str[-1]
df_train['reproduction'] = df_train['sexupon_outcome'].str.split().str[0].fillna('Unknown')

df_test['sex'] = df_test['sexupon_outcome'].str.split().str[-1]
df_test['reproduction'] = df_test['sexupon_outcome'].str.split().str[0].fillna('Unknown')

Simplify color

In [ ]:
df_train['simple_color'] = df_train['color'].apply(lambda x: x.split('/')[0])
df_test['simple_color'] = df_test['color'].apply(lambda x: x.split('/')[0])

Extract mix from breed

In [ ]:
df_train['is_mix'] = df_train['breed'].str.contains('mix', case = False, na = False)
df_test['is_mix'] = df_test['breed'].str.contains('mix', case = False, na = False)

Add name flag

In [ ]:
df_train['has_name'] = df_train.name.notna()
df_test['has_name'] = df_test.name.notna()

And finally, let's see how many babies and adult animals we have.
This is a bit hard to derive, as this differes by animal and by breed. Also it's a process not an outcome.

Will make it simple for this analysis and have 3 categories:

  • newborn (8 weeks)
  • baby (up to 1 year)
  • adults (from 1 year or more)
In [ ]:
newborn_max_age = 8 * 7  
baby_max_age = 365       

df_train['age_category'] = pd.cut(df_train['age_in_days'], bins = [0, newborn_max_age, baby_max_age, float('inf')],
                                  labels = ['newborn', 'baby', 'adult'], right = False)
df_test['age_category'] = pd.cut(df_test['age_in_days'], bins = [0, newborn_max_age, baby_max_age, float('inf')],
                                  labels = ['newborn', 'baby', 'adult'], right = False)

Now, let's analyze!

In [ ]:
df_train['outcome_type'].value_counts()
Out[ ]:
Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: outcome_type, dtype: int64
  • The highest share were adopted and transfered.
  • A few animals died, which is a less likely scenario. Predicting such cases is almost impossible.
In [ ]:
df_animals_date = df_train.groupby(['date', 'animal_type']).size().reset_index(name = 'count')
fig = px.line(df_animals_date, x = 'date', y = 'count', color = 'animal_type', title = 'The data starts being tracked more prominently between the end of 2013 and the end of 2015.')
fig.show()
  • The data seems to be more tracked from end of 2013 until end of 2015.
  • The animal shelter gets both cats and dogs, and more likely to shelter dogs.
  • Interestingly, there were some peaks, would have to investigate that later.
In [ ]:
df_outcome_cnt = df_train.groupby(['outcome_type', 'animal_type']).size().reset_index(name = 'count')
df_outcome_cnt['proportion'] = df_outcome_cnt.groupby('animal_type')['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(df_outcome_cnt, x = 'animal_type', y = 'proportion', color = 'outcome_type', title = 'Dogs are more likely to be returned to the owner and cats to be transferred.')
fig.show()

Let's check what could be the reason for that.

In [ ]:
fig = px.box(df_train, x = 'animal_type', y = 'age_in_years', log_y = True, title = 'Most sheltered animals are young, especially the cats.') 
fig.show()
In [ ]:
df_animals_date = df_train.groupby(['year_month', 'animal_type', 'age_category']).size().reset_index(name = 'count')
fig = px.line(df_animals_date, x = 'year_month', y = 'count', color = 'age_category', facet_row = 'animal_type', title = 'Quite some peaks! Are these due to transfers?')
fig.show()

Also ...

  • Most cats are newborns or babies.
  • Most dogs that end up in shelters are already adults.
In [ ]:
df_animal_name = df_train.groupby(['animal_type', 'has_name', 'outcome_type']).size().reset_index(name = 'count')
fig = px.bar(df_animal_name, x = 'has_name', y = 'count', color = 'outcome_type', facet_col = 'animal_type', title = 'Having a name leads to better outcomes.')
fig.show()
  • More than 50% of the cats have no name, whereas dogs are less likely to not have one.
  • Dogs with names are more likely to be returned to their owner.
  • Cats with names are more likely to be adopted. There must be something more to that...
In [ ]:
fig = px.box(df_train, x = 'has_name', y = 'age_in_years', facet_col = 'animal_type', title = 'They are also a bit older when having a name.')
fig.show()
In [ ]:
fig = px.box(df_train, x = 'outcome_type', y = 'age_in_years', title = 'When it comes to pets returned to owners, they are more likely already adults.')
fig.show()
  • Most adopted pets are young, a huge part being babies.
  • The adult portion comes more from the dogs side.
In [ ]:
df_repr_outcome = df_train.groupby(['animal_type', 'reproduction', 'outcome_type']).size().reset_index(name = 'count')
df_repr_outcome['proportion'] = df_repr_outcome.groupby(['reproduction', 'animal_type'])['count'].transform(lambda x: x / x.sum() * 100)
fig = px.bar(df_repr_outcome, x = 'reproduction', y = 'proportion', color = 'outcome_type', facet_row = 'animal_type', title = 'Neutered and spayed pets are more likely to be adopted!')
fig.show()
  • It would be a consideration to neuter/spay pets to increase their likelihood of being adopted.
  • Intact are less likely to be adopted.
In [ ]:
fig = px.histogram(df_train, x = 'hour', facet_row = 'outcome_type', facet_col = 'animal_type', height = 1000, width = 800, title = 'Adoption is more likely to happen in the late hours, whereas transfers in the earlier.')
fig.show()
  • Transfers are probably pre-planned and have a clear schedule.
  • Most people work during the day, so probably they would adopt after work with a clear mind.

Modeling with Random Forest¶

  1. Filling NAs
In [ ]:
fig = px.histogram(df_train, x = 'age_in_years')
fig.show()

Will use the median for imputation.

In [ ]:
df_train['age_in_years'].fillna(df_train['age_in_years'].median(), inplace = True)
  1. Finally, one-hot encoding ...
In [ ]:
df_dummies = pd.get_dummies(df_train[['sex', 'reproduction', 'simple_color', 'animal_type', 'age_category', 'breed']], drop_first = True)
df_train = pd.concat([df_dummies, df_train], axis = 1)

df_dummies_test = pd.get_dummies(df_test[['sex', 'reproduction', 'simple_color', 'animal_type', 'age_category', 'breed']], drop_first = True)
df_test = pd.concat([df_dummies_test, df_test], axis = 1)
In [ ]:
X = df_train.drop(columns = ['animal_id', 'outcome_type', 'outcome_subtype', 'animal_type', 
                             'sexupon_outcome', 'ageupon_outcome', 'simple_color', 'breed', 'color', 
                              'hour','date_time', 'date', 'month', 'year_month', 'age_in_days', 
                             'sex', 'reproduction', 'age_category', 'name', 'animal_type_Dog']).values.reshape(-1, 1445)
y = df_train['outcome_type']
X_test = df_test.drop(columns = ['animal_type', 'sexupon_outcome', 'ageupon_outcome', 'simple_color', 'breed', 'color', 
                             'date_time', 'age_in_days','sex', 'reproduction', 'age_category', 'name', 'animal_type_Dog']).values.reshape(-1, 974)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[106], line 6
      1 X = df_train.drop(columns = ['animal_id', 'outcome_type', 'outcome_subtype', 'animal_type', 
      2                              'sexupon_outcome', 'ageupon_outcome', 'simple_color', 'breed', 'color', 
      3                               'hour','date_time', 'date', 'month', 'year_month', 'age_in_days', 
      4                              'sex', 'reproduction', 'age_category', 'name', 'animal_type_Dog']).values.reshape(-1, 1445)
      5 y = df_train['outcome_type']
----> 6 X_test = df_test.drop(columns = ['animal_id', 'outcome_type', 'outcome_subtype', 'animal_type', 
      7                              'sexupon_outcome', 'ageupon_outcome', 'simple_color', 'breed', 'color', 
      8                               'hour','date_time', 'date', 'month', 'year_month', 'age_in_days', 
      9                              'sex', 'reproduction', 'age_category', 'name', 'animal_type_Dog']).values.reshape(-1, 1445).values.reshape(-1, 6)

File c:\Users\Olga\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File c:\Users\Olga\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\frame.py:5396, in DataFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
   5248 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
   5249 def drop(  # type: ignore[override]
   5250     self,
   (...)
   5257     errors: IgnoreRaise = "raise",
   5258 ) -> DataFrame | None:
   5259     """
   5260     Drop specified labels from rows or columns.
   5261 
   (...)
   5394             weight  1.0     0.8
   5395     """
-> 5396     return super().drop(
   5397         labels=labels,
   5398         axis=axis,
   5399         index=index,
   5400         columns=columns,
   5401         level=level,
   5402         inplace=inplace,
   5403         errors=errors,
   5404     )

File c:\Users\Olga\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File c:\Users\Olga\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\generic.py:4505, in NDFrame.drop(self, labels, axis, index, columns, level, inplace, errors)
   4503 for axis, labels in axes.items():
   4504     if labels is not None:
-> 4505         obj = obj._drop_axis(labels, axis, level=level, errors=errors)
   4507 if inplace:
   4508     self._update_inplace(obj)

File c:\Users\Olga\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\generic.py:4546, in NDFrame._drop_axis(self, labels, axis, level, errors, only_slice)
   4544         new_axis = axis.drop(labels, level=level, errors=errors)
   4545     else:
-> 4546         new_axis = axis.drop(labels, errors=errors)
   4547     indexer = axis.get_indexer(new_axis)
   4549 # Case for non-unique axis
   4550 else:

File c:\Users\Olga\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\core\indexes\base.py:6977, in Index.drop(self, labels, errors)
   6975 if mask.any():
   6976     if errors != "ignore":
-> 6977         raise KeyError(f"{list(labels[mask])} not found in axis")
   6978     indexer = indexer[~mask]
   6979 return self.delete(indexer)

KeyError: "['animal_id', 'outcome_type', 'outcome_subtype', 'hour', 'date', 'month', 'year_month', 'age_category', 'animal_type_Dog'] not found in axis"
In [ ]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size =  0.2, stratify = y, random_state = 42)
In [ ]:
X_train.shape, y_train.shape
Out[ ]:
((21383, 1445), (21383,))

Modeling in Random Forest ...

In [ ]:
model = RandomForestClassifier(random_state = 42)
model.fit(X_train, y_train)
print("Training Accuracy:", model.score(X_train, y_train))
Training Accuracy: 0.8513772623111818

Validate model

In [ ]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
                 precision    recall  f1-score   support

       Adoption       0.64      0.76      0.69      2154
           Died       0.00      0.00      0.00        39
     Euthanasia       0.32      0.12      0.18       311
Return_to_owner       0.42      0.39      0.41       957
       Transfer       0.66      0.62      0.64      1885

       accuracy                           0.60      5346
      macro avg       0.41      0.38      0.38      5346
   weighted avg       0.58      0.60      0.59      5346

In [ ]:
unique_labels = sorted(y_val.unique())

cm = confusion_matrix(y_val, y_pred, labels = unique_labels)

fig = ff.create_annotated_heatmap(z = cm,
                                  x = unique_labels,
                                  y = unique_labels,
                                  hovertemplate='Count: %{z}',
                                  colorscale='Viridis')
fig.update_layout(xaxis = dict(title = 'Predicted Label'), yaxis = dict(title = 'Actual Label'))
fig.show()
  • Tranfers and Return to Owner are mistaken as Adoption and vice versa.
  • Euthanasia are mistaked as transfers.
  • Death is not something we could predict.

Let's make things more simple for us.
At the end of the day we want the pets to find someone to take care of them.

Let's put the outcomes into smaller groups:

  1. Owner Found = Return to Owner and Adoption
  2. Owner in Search = Transfer
  3. Owner not Found = Euthanasia and Death
In [ ]:
outcome_to_group = {
    'Return_to_owner': 'Owner Found',
    'Adoption': 'Owner Found',
    'Transfer': 'Owner in Search',
    'Euthanasia': 'Owner not Found',
    'Died': 'Owner not Found'
}

# Map outcomes to groups
df_train['outcome'] = df_train['outcome_type'].map(outcome_to_group)
In [ ]:
y = df_train['outcome']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size =  0.2, stratify = y, random_state = 42)
In [ ]:
model.fit(X_train, y_train)
print("Training Accuracy:", model.score(X_train, y_train))
Out[ ]:
0.8954309498199504
In [ ]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
                 precision    recall  f1-score   support

    Owner Found       0.77      0.89      0.82      3111
Owner in Search       0.70      0.59      0.64      1885
Owner not Found       0.35      0.15      0.21       350

       accuracy                           0.74      5346
      macro avg       0.61      0.54      0.56      5346
   weighted avg       0.72      0.74      0.72      5346

In [ ]:
unique_labels = sorted(y_val.unique())

cm = confusion_matrix(y_val, y_pred, labels = unique_labels)

fig = ff.create_annotated_heatmap(z = cm,
                                  x = unique_labels,
                                  y = unique_labels,
                                  hovertemplate='Count: %{z}',
                                  colorscale='Viridis')
fig.update_layout(xaxis = dict(title = 'Predicted Label'), yaxis = dict(title = 'Actual Label'))
fig.show()